Problem 1

Initialization

This step loads the required libraries and the dataset:

#Required for plotting: select the cairo bitmap device
#(presumably needed because this renders on a headless system -- confirm)
options(bitmapType = "cairo")

#Load libraries, suppressing start-up chatter where possible
library(dplyr,      quietly=TRUE, verbose=FALSE, warn.conflicts=FALSE)
library(Seurat,     quietly=TRUE, verbose=FALSE, warn.conflicts=FALSE)
## 
## Attaching package: 'SeuratObject'
## The following object is masked from 'package:base':
## 
##     intersect
library(patchwork,  quietly=TRUE, verbose=FALSE, warn.conflicts=FALSE)

# Load dataset
# Load dataset: build a Seurat object from the 10x Genomics matrix
pbmc <- CreateSeuratObject(
                #Data directory holding the 10x matrix files (hg19 reference)
                counts = Read10X(data.dir = "./data/filtered_matrices_mex/hg19/"),
                #Dataset name
                project = "pbmc68k",
                #Omit features detected in <5 cells
                min.cells = 5,
                #Omit cells with <200 features
                min.features = 200
            )
## Warning: Feature names cannot have underscores ('_'), replacing with dashes
## ('-')

Pre-processing

This step carries out the 4-stage preprocessing of the dataset: QC, normalization, feature selection, centering and scaling.

Stage 1: QC

First, QC metrics are visualized:

#Get QC metrics: percentage of counts coming from features whose names
#start with "MT-" (mitochondrial genes in this annotation)
pbmc[["percent.mt"]] <- PercentageFeatureSet(pbmc, pattern = "^MT-")

#Plot QC metrics: detected features, total counts, mitochondrial % per cell
VlnPlot(pbmc, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol = 3)
## Warning: Default search for "data" layer in "RNA" assay yielded no results;
## utilizing "counts" layer instead.

#Scatter plots of count depth vs. mitochondrial % and vs. feature count
FeatureScatter(pbmc, feature1 = "nCount_RNA", feature2 = "percent.mt") +
FeatureScatter(pbmc, feature1 = "nCount_RNA", feature2 = "nFeature_RNA") 

Based on these metrics, low-quality cells are filtered out of the dataset:

pbmc <- subset(pbmc,
                #Keep only data from cells which:
                #1. have less than 4% mitochondrial counts, and
                #2. have more than 300 but fewer than 2000 features
                subset = percent.mt < 4 &
                         nFeature_RNA > 300 &
                         nFeature_RNA < 2000)

Stage 2: Normalization

#Normalize the counts layer using Seurat's default normalization
pbmc <- NormalizeData(pbmc)
## Normalizing layer: counts

Stage 3: Feature selection

#Select the 2000 most highly-variable features with the 'vst' method
pbmc <- FindVariableFeatures(pbmc, selection.method = 'vst', nfeatures = 2000)
## Finding variable features for layer counts
#Plot the relevant features, labeling the 10 most variable ones
varfts_plot <- VariableFeaturePlot(pbmc)
varfts_plot + LabelPoints(plot = varfts_plot, points = head(VariableFeatures(pbmc), 10), repel = TRUE)
## When using repel, set xnudge and ynudge to 0 for optimal results
## Warning in scale_x_log10(): log-10 transformation introduced infinite values.
## log-10 transformation introduced infinite values.

Stage 4: Centering/scaling

Due to ceaseless OOM errors in this step, I had to restrict the scaled features to only the variable features (by omitting the features argument passed to Seurat::ScaleData()). This shouldn’t affect the PCA and clustering results, but the heatmaps (see below) will likely be off. I’ve included them nonetheless.

#Center and scale the data (variable features only; see note above the chunk)
pbmc <- ScaleData(pbmc,
                    #Helps prevent OOM errors
                    #NOTE(review): block.size=1 processes one feature per
                    #chunk, which is very slow; a moderate value (e.g. 100)
                    #would presumably still avoid OOM -- worth confirming
                    block.size=1)
## Centering and scaling data matrix

PCA

In this step, PCA of the pre-processed data is carried out.

First, carry out the PCA proper:

#Carry out the PCA on the selected variable features
pbmc <- RunPCA(pbmc, features = VariableFeatures(object = pbmc))
## PC_ 1 
## Positive:  CST3, SPI1, SERPINA1, LST1, FCN1, LYZ, CFD, RP11-290F20.3, CD68, IFI30 
##     MS4A7, PILRA, HCK, AIF1, TMEM176B, TYMP, FCER1G, LRRC25, CFP, HLA-DRB1 
##     HLA-DRA, TYROBP, HLA-DRB5, HLA-DPA1, S100A9, IGSF6, HMOX1, LILRB2, HLA-DPB1, C19orf38 
## Negative:  RPS6, RPL13, RPS18, RPS2, LTB, IL32, RPL34, CD7, IL7R, CCR7 
##     JUN, CD8B, AQP3, RGCC, RPLP1, EIF3E, PASK, RP11-291B21.2, CTSW, NGFRAP1 
##     CD8A, GZMK, MYC, CCL5, FKBP11, RP11-664D1.1, HIST1H1D, CRIP2, DUSP2, JUNB 
## PC_ 2 
## Positive:  LTB, RPL13, RPS2, CD79A, RPS18, RPS6, RPL34, HLA-DRA, RPLP1, TCL1A 
##     CD79B, MS4A1, HLA-DQA1, LINC00926, JUNB, VPREB3, FCER2, HLA-DQA2, HLA-DMA, LY86 
##     HLA-DMB, HLA-DOB, SPIB, BANK1, HVCN1, CCR7, FCRLA, HLA-DQB1, EAF2, HLA-DPA1 
## Negative:  NKG7, GNLY, CST7, GZMB, GZMA, FGFBP2, CCL5, GZMH, PRF1, CTSW 
##     HOPX, CLIC3, SPON2, FCGR3A, CCL4, TYROBP, SRGN, MATK, PRSS23, CD63 
##     S1PR5, PFN1, KLRD1, IGFBP7, GPR56, TMSB4X, S100A4, FCRL6, C1orf21, IFITM2 
## PC_ 3 
## Positive:  CD79A, MS4A1, TCL1A, CD79B, HLA-DRA, HLA-DQA1, HLA-DPB1, HLA-DRB1, CD74, HLA-DPA1 
##     HLA-DQA2, HLA-DRB5, LINC00926, VPREB3, FCER2, SPIB, HLA-DMB, HLA-DQB1, HLA-DMA, HLA-DOB 
##     BANK1, PDLIM1, FCRLA, HVCN1, IRF8, BLK, GZMB, LY86, CD22, PNOC 
## Negative:  JUNB, RP11-290F20.3, SERPINA1, AIF1, CFD, RPS6, MS4A7, TMEM176B, PILRA, IL7R 
##     RPL13, FCN1, RPL34, RPS2, LST1, HES4, LILRB2, CD68, CDKN1C, IFI30 
##     LRRC25, RPS18, HMOX1, GPBAR1, LTB, C1QA, VMO1, CSF1R, CFP, MAFB 
## PC_ 4 
## Positive:  RPS2, RPL13, RPS6, FCGR3A, RPS18, RP11-290F20.3, MS4A7, CD79B, CD79A, TCL1A 
##     NKG7, SERPINA1, MS4A1, GNLY, HES4, HMOX1, LINC00926, PILRA, CFD, CDKN1C 
##     VMO1, FCER2, VPREB3, TESC, FGFBP2, HVCN1, CST7, IFITM3, LILRA3, LRRC25 
## Negative:  PPBP, CLU, GNG11, PF4, SDPR, NRGN, PTCRA, HIST1H2AC, TUBB1, GP9 
##     ACRBP, SPARC, CMTM5, TREML1, NCOA4, RUFY1, ITGA2B, SNCA, RGS18, MPP1 
##     AP001189.4, MYL9, CD9, CLDN5, LMNA, AC147651.3, RAB32, GPX1, GSN, MAP3K7CL 
## PC_ 5 
## Positive:  MS4A7, RP11-290F20.3, CD79B, VMO1, CD79A, FCGR3A, CDKN1C, HMOX1, HES4, CD68 
##     HIST1H2AC, CKB, TCL1A, MS4A1, ICAM4, LYN, PPBP, GNG11, LILRA3, SDPR 
##     SERPINA1, PF4, SIGLEC10, C5AR1, PILRA, CLU, TESC, MPP1, LYL1, LINC00926 
## Negative:  FCER1A, LGALS2, CLEC10A, LYZ, ALDH2, S100A8, CPVL, CD1C, ENHO, MS4A6A 
##     S100A9, GRN, CST3, CD14, RNASE6, RPS2, CAPG, AMICA1, IGSF6, S100A4 
##     BLVRB, CD33, LGALS1, HLA-DMA, CACNA2D3, RPLP1, SERPINF1, CD1D, TMSB10, FCGRT
#Plot PCA results: top gene loadings for the first two PCs
VizDimLoadings(pbmc, dims = 1:2, reduction = 'pca')

#Cells projected onto PC1/PC2
DimPlot(pbmc, reduction = 'pca')

#Heatmap of PC1 over 500 cells
DimHeatmap(pbmc, dims = 1, cells = 500, balanced = TRUE)

#Heatmaps of the first 15 PCs over 500 cells
DimHeatmap(pbmc, dims = 1:15, cells = 500, balanced = TRUE)

Next, find which of the principal components are actually significant:

#Carry out JackStraw resampling (100 replicates) and score the first 20 PCs
pbmc <- JackStraw(pbmc, num.replicate = 100)
pbmc <- ScoreJackStraw(pbmc, dims = 1:20)

#Plot JackStraw results for the first 20 PCs
JackStrawPlot(pbmc, dims = 1:20)
## Warning: Removed 28718 rows containing missing values or values outside the scale range
## (`geom_point()`).

#Also produce an elbow plot (PC standard deviations) as a sanity check
ElbowPlot(pbmc)

Based on this, I assumed a cutoff between the 10th and 11th PCs.

Clustering & UMAP

This step carries out the clustering and the 2D (UMAP) embedding of the dataset. (UMAP is used in place of tSNE; see the RunUMAP call below.)

Firstly, clustering:

#Build the (shared) nearest-neighbor graph on the first 10 significant PCs
pbmc <- FindNeighbors(pbmc, dims = 1:10)
## Computing nearest neighbor graph
## Computing SNN
#Cluster the graph
pbmc <- FindClusters(pbmc,
                        #Value picked to net 10 clusters
                        resolution = 0.15)
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 65251
## Number of edges: 1769153
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9518
## Number of communities: 10
## Elapsed time: 58 seconds

Secondly, the UMAP embedding:

#Compute the 2D embedding on the first 10 PCs
#NOTE(review): this runs UMAP, not tSNE; use RunTSNE() instead if a tSNE
#embedding is actually required by the assignment
pbmc <- RunUMAP(pbmc, dims = 1:10)
## Warning: The default method for RunUMAP has changed from calling Python UMAP via reticulate to the R-native UWOT using the cosine metric
## To use Python UMAP via reticulate, set umap.method to 'umap-learn' and metric to 'correlation'
## This message will be shown once per session
## 23:01:42 UMAP embedding parameters a = 0.9922 b = 1.112
## 23:01:42 Read 65251 rows and found 10 numeric columns
## 23:01:42 Using Annoy for neighbor search, n_neighbors = 30
## 23:01:42 Building Annoy index with metric = cosine, n_trees = 50
## 0%   10   20   30   40   50   60   70   80   90   100%
## [----|----|----|----|----|----|----|----|----|----|
## **************************************************|
## 23:01:58 Writing NN index file to temp file /tmp/RtmpnrYO0C/file64da12d4fc56
## 23:01:58 Searching Annoy index using 1 thread, search_k = 3000
## 23:03:13 Annoy recall = 100%
## 23:03:15 Commencing smooth kNN distance calibration using 1 thread with target n_neighbors = 30
## 23:03:19 Initializing from normalized Laplacian + noise (using RSpectra)
## 23:03:31 Commencing optimization for 200 epochs, with 2786446 positive edges
## 23:04:36 Optimization finished

Lastly, the results of the UMAP embedding are visualized as in Zheng et al. 2017.

#Count the cells belonging to each cluster
cluster_sizes <- table(Idents(pbmc))

#Convert the absolute counts into percentages of all cells
cluster_pcts <- cluster_sizes / sum(cluster_sizes) * 100

#Build "<cluster>: <percentage>%" labels, one per cluster, keyed by cluster ID
labels <- sprintf("%s: %.2f%%", levels(pbmc), cluster_pcts)
names(labels) <- levels(pbmc)

#Apply the labels as the active cluster identities
pbmc <- RenameIdents(pbmc, labels)

#Plot the labeled clusters on the UMAP embedding
DimPlot(pbmc, reduction = 'umap', label=TRUE) + NoLegend()

#Restore the nondescript cluster names by inverting the label mapping
pbmc <- RenameIdents(pbmc, setNames(names(labels), as.character(labels)))

Problem 2

Here, each of the 10 clusters is independently re-clustered.

The following helper function will be used to automatically re-cluster and plot each cluster:

#Re-cluster a single cluster and plot its sub-clusters on a fresh UMAP.
#Arguments:
#   cluster:    Name of the cluster to re-cluster
#   resolution: Passed to Seurat::FindClusters(); can be changed to alter the number of created sub-clusters
#   object:     Seurat object holding the clusters; defaults to the global
#               `pbmc` so existing calls keep working
#   dims:       Principal components used for the neighbor graph and UMAP
#Returns:       A labeled DimPlot of the sub-clusters (legend suppressed)
recluster <- function(cluster, resolution, object = pbmc, dims = 1:10) {

    #Retrieve the cells belonging to the requested cluster
    cl <- subset(x = object, idents = cluster)

    #Carry out clustering and the UMAP embedding on the subset
    #NOTE(review): this reuses the PCA computed on the full dataset;
    #re-running FindVariableFeatures/ScaleData/RunPCA on the subset may
    #separate sub-populations more cleanly -- confirm against the intent
    cl <- FindNeighbors(cl, dims = dims, verbose=FALSE)
    cl <- FindClusters(cl, resolution=resolution, verbose=FALSE)
    cl <- RunUMAP(cl, dims = dims, verbose=FALSE)

    #Label each sub-cluster as "<parent>/<sub>: <percentage>%"
    sizes <- table(Idents(cl))
    pcts <- sizes / sum(sizes) * 100
    labels <- sprintf("%s/%s: %.2f%%", cluster, levels(cl), pcts)
    names(labels) <- levels(cl)
    cl <- RenameIdents(cl, labels)

    #Plot, drawing labels directly on the clusters
    DimPlot(cl, reduction = 'umap', label=TRUE) + NoLegend()
}

Now, each cluster will be re-clustered proper.

Cluster no. 0, for 2 sub-clusters:

#The resolution values below were tuned per cluster to yield the
#sub-cluster counts stated in the surrounding text
recluster("0", 0.15)

Cluster no. 1, for 2 sub-clusters:

recluster("1", 0.2)

Cluster no. 2, for 3 sub-clusters:

recluster("2", 0.2)

Cluster no. 3, for 1 sub-cluster, so no actual further clustering:

recluster("3", 0.1)

Cluster no. 4, for 2 sub-clusters:

recluster("4", 0.1)

Cluster no. 5, for 3 sub-clusters:

recluster("5", 0.1)

Cluster no. 6, for 4 sub-clusters:

recluster("6", 0.2)

Cluster no. 7, for 2 sub-clusters:

recluster("7", 0.25)

Cluster no. 8, for 2 sub-clusters:

recluster("8", 0.1)

Cluster no. 9, for 2 sub-clusters:

recluster("9", 0.5)